library(tidyverse)
## Warning: パッケージ 'tidyverse' はバージョン 4.3.1 の R の下で造られました
## Warning: パッケージ 'purrr' はバージョン 4.3.3 の R の下で造られました
## Warning: パッケージ 'dplyr' はバージョン 4.3.2 の R の下で造られました
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl)
library(purrr)
library(stringr)
library(stargazer)
##
## Please cite as:
##
## Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
#データ整理と変換
##(a)Semester Dataの整形
#csvファイルの読み込み。デフォルトで一行目が列名として扱われる。
semester1 <- read_csv("semester_data_1.csv")
## Rows: 6632 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): x1, x2, x3, x4, x5, x6
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
semester2 <- read_csv("semester_data_2.csv")
## Rows: 7258 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): x2
## dbl (5): x1, x3, x4, x5, x6
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#列ごとの型を確認する。1と2で型が違うため揃える必要があると判断。
str(semester1)
## spc_tbl_ [6,632 × 6] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ x1: chr [1:6632] "unitid" "100654" "100654" "100654" ...
## $ x2: chr [1:6632] "instnm" "ALABAMA A&M UNIVERSITY" "ALABAMA A&M UNIVERSITY" "ALABAMA A & M UNIVERSITY" ...
## $ x3: chr [1:6632] "semester" "1" "1" "1" ...
## $ x4: chr [1:6632] "quarter" "0" "0" "0" ...
## $ x5: chr [1:6632] "year" "1991" "1992" "1993" ...
## $ x6: chr [1:6632] "Y" "1991" "1992" "1993" ...
## - attr(*, "spec")=
## .. cols(
## .. x1 = col_character(),
## .. x2 = col_character(),
## .. x3 = col_character(),
## .. x4 = col_character(),
## .. x5 = col_character(),
## .. x6 = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
str(semester2)
## spc_tbl_ [7,258 × 6] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ x1: num [1:7258] 182634 182634 182634 182634 182634 ...
## $ x2: chr [1:7258] "COLBY-SAWYER COLLEGE" "COLBY-SAWYER COLLEGE" "COLBY-SAWYER COLLEGE" "COLBY-SAWYER COLLEGE" ...
## $ x3: num [1:7258] 1 1 1 1 1 1 1 1 1 1 ...
## $ x4: num [1:7258] 0 0 0 0 0 0 0 0 0 0 ...
## $ x5: num [1:7258] 1991 1992 1993 1995 1996 ...
## $ x6: num [1:7258] 1991 1992 1993 1995 1996 ...
## - attr(*, "spec")=
## .. cols(
## .. x1 = col_double(),
## .. x2 = col_character(),
## .. x3 = col_double(),
## .. x4 = col_double(),
## .. x5 = col_double(),
## .. x6 = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
#型を揃える。character型に統一したほうが良いと判断。
semester2 <- semester2 %>% mutate(across(-x2, as.character))
#結合。
semester_bind <- bind_rows(semester1,semester2)
#一行目を取り除き、二行目を列名にする。
semester_bind <- semester_bind %>%
slice(-1) %>%
set_names(as.character(semester_bind[1, ]))
#Y列を削除。
semester_bind <- semester_bind %>% select(-c("Y"))
# semester制が導入された年の列を作成
semester_bind <- semester_bind %>%
group_by(unitid) %>%
mutate(
yearsem = ifelse(
any(semester == 0) & any(semester == 1),
unique(year[semester == 1 & lag(semester) == 0]),
NA
)
) %>%
ungroup()
#semester制導入後を示すafter列を作成
semester_bind <- semester_bind %>%
mutate(
after = ifelse(
year >= yearsem,
1,
0
)
)
DT::datatable(semester_bind)
## Warning in instance$preRenderHook(instance): It seems your data is too big for
## client-side DataTables. You may consider server-side processing:
## https://rstudio.github.io/DT/server.html
#1994年以外の年の.xlsファイルがあるため、paste0()でディレクトリをまとめてしまう
available_years <- setdiff(1991:2014, c(1994))
file_paths <- paste0("C:/Users/nasuk/OneDrive/デスクトップ/ra_bootcamp_warmp/", available_years, ".xlsx")
#まとめたディレクトリを用いて、一度に結合する
gradrate_bind <- file_paths %>%
map(~ read_excel(.x)) %>%
bind_rows()
#女子学生の4年卒業率に0.01をかけて、0から1のスケールに変更
gradrate_bind <- gradrate_bind %>%
mutate(womengradrate4yr = women_gradrate_4yr * 0.01)
#データ型を確認。num型に揃える必要があるため、変換する。
str(gradrate_bind)
## tibble [18,883 × 10] (S3: tbl_df/tbl/data.frame)
## $ unitid : num [1:18883] 100654 100663 100751 100858 101435 ...
## $ year : num [1:18883] 1991 1991 1991 1991 1991 ...
## $ totcohortsize : chr [1:18883] "1010" "937" "2511" "3024" ...
## $ w_cohortsize : num [1:18883] 527 500 1348 1496 101 ...
## $ m_cohortsize : num [1:18883] 483 437 1163 1528 88 ...
## $ tot4yrgrads : num [1:18883] 152 82 630 846 60 109 73 66 217 117 ...
## $ m_4yrgrads : chr [1:18883] "32" "33" "213" "312" ...
## $ w_4yrgrads : num [1:18883] 120 49 417 534 35 67 32 43 128 85 ...
## $ women_gradrate_4yr: num [1:18883] 22.8 9.8 30.9 35.7 34.6 ...
## $ womengradrate4yr : num [1:18883] 0.228 0.098 0.309 0.357 0.346 ...
gradrate_bind <- gradrate_bind %>%
mutate(
totcohortsize = as.numeric(totcohortsize),
m_4yrgrads = as.numeric(m_4yrgrads)
)
#男性と全体の卒業率を計算
gradrate_bind <- gradrate_bind %>%
mutate(totgradrate4yr = tot4yrgrads / totcohortsize)
gradrate_bind <- gradrate_bind %>%
mutate(mengradrate4yr = m_4yrgrads / m_cohortsize)
#有効数字3桁を指定。DTで見るとところどころ有効係数が変わっているようにも見えるが…
gradrate_bind <- gradrate_bind %>%
mutate(
totgradrate4yr = sprintf("%.3g", totgradrate4yr),
mengradrate4yr = sprintf("%.3g", mengradrate4yr)
)
#2010年までの行のみ抽出
gradrate_bind <- gradrate_bind %>%
filter(year <= 2010)
DT::datatable(gradrate_bind)
## Warning in instance$preRenderHook(instance): It seems your data is too big for
## client-side DataTables. You may consider server-side processing:
## https://rstudio.github.io/DT/server.html
##(c)Covariates Dataの整形
#生データ読み込み
covariates <- read_excel("C:/Users/nasuk/OneDrive/デスクトップ/ra_bootcamp_warmp/covariates.xlsx")
#列名の変更
covariates <- covariates %>% rename(unitid = university_id)
#"aaaa"の削除
covariates$unitid <- covariates$unitid %>% str_remove_all("aaaa")
#wide型に変換
covariates <- covariates %>% pivot_wider(names_from = category, values_from = value)
outcome 1991-2016 1994なし 1991-2010にフィルター semester 1991-2010
#年の範囲を揃える
covariates <- covariates %>% filter(year>=1991, year<=2010, year != 1994)
#gradrateにおけるunitidの一覧を作る
unitids_vector <- unique(gradrate_bind$unitid)
#gradrateと一致するunitidのみ抽出
covariates <- covariates %>% filter(unitid %in% unitids_vector)
DT::datatable(covariates)
## Warning in instance$preRenderHook(instance): It seems your data is too big for
## client-side DataTables. You may consider server-side processing:
## https://rstudio.github.io/DT/server.html
#列の結合に用いるunitidとyearの型が一致せず。gradrate_bindのみnumなのでcharに揃える。
str(semester_bind)
## tibble [13,889 × 7] (S3: tbl_df/tbl/data.frame)
## $ unitid : chr [1:13889] "100654" "100654" "100654" "100654" ...
## $ instnm : chr [1:13889] "ALABAMA A&M UNIVERSITY" "ALABAMA A&M UNIVERSITY" "ALABAMA A & M UNIVERSITY" "ALABAMA A & M UNIVERSITY" ...
## $ semester: chr [1:13889] "1" "1" "1" "1" ...
## $ quarter : chr [1:13889] "0" "0" "0" "0" ...
## $ year : chr [1:13889] "1991" "1992" "1993" "1995" ...
## $ yearsem : chr [1:13889] NA NA NA NA ...
## $ after : num [1:13889] NA NA NA NA NA NA NA NA NA NA ...
str(gradrate_bind)
## tibble [13,889 × 12] (S3: tbl_df/tbl/data.frame)
## $ unitid : num [1:13889] 100654 100663 100751 100858 101435 ...
## $ year : num [1:13889] 1991 1991 1991 1991 1991 ...
## $ totcohortsize : num [1:13889] 1010 937 2511 3024 189 ...
## $ w_cohortsize : num [1:13889] 527 500 1348 1496 101 ...
## $ m_cohortsize : num [1:13889] 483 437 1163 1528 88 ...
## $ tot4yrgrads : num [1:13889] 152 82 630 846 60 109 73 66 217 117 ...
## $ m_4yrgrads : num [1:13889] 32 33 213 312 25 42 41 23 89 32 ...
## $ w_4yrgrads : num [1:13889] 120 49 417 534 35 67 32 43 128 85 ...
## $ women_gradrate_4yr: num [1:13889] 22.8 9.8 30.9 35.7 34.6 ...
## $ womengradrate4yr : num [1:13889] 0.228 0.098 0.309 0.357 0.346 ...
## $ totgradrate4yr : chr [1:13889] "0.15" "0.0875" "0.251" "0.28" ...
## $ mengradrate4yr : chr [1:13889] "0.0663" "0.0755" "0.183" "0.204" ...
str(covariates)
## tibble [13,889 × 6] (S3: tbl_df/tbl/data.frame)
## $ unitid : chr [1:13889] "100654" "100654" "100654" "100654" ...
## $ year : chr [1:13889] "1991" "1992" "1993" "1995" ...
## $ instatetuition : chr [1:13889] "1298" "1600" "1600" "2022" ...
## $ costs : chr [1:13889] "53.121007" "52.536624" "50.39972" "68.024267" ...
## $ faculty : chr [1:13889] "223" "267" "262" "298" ...
## $ white_cohortsize: chr [1:13889] "11" "5" "7" "18" ...
gradrate_bind <- gradrate_bind %>%
mutate(
unitid = as.character(unitid),
year = as.character(year)
)
#unitidとyearで結合してマスターデータ作成
master <- semester_bind %>%
left_join(gradrate_bind, by =c("unitid","year")) %>%
left_join(covariates, by =c("unitid","year") )
DT::datatable(master)
## Warning in instance$preRenderHook(instance): It seems your data is too big for
## client-side DataTables. You may consider server-side processing:
## https://rstudio.github.io/DT/server.html
#分析
##(a)記述統計
##(b)回帰分析
model <- lm(totgradrate4yr ~ after, data=master)
stargazer(model)
##
## % Table created by stargazer v.5.2.3 by Marek Hlavac, Social Policy Institute. E-mail: marek.hlavac at gmail.com
## % Date and time: 日, 8 18, 2024 - 21:31:03
## \begin{table}[!htbp] \centering
## \caption{}
## \label{}
## \begin{tabular}{@{\extracolsep{5pt}}lc}
## \\[-1.8ex]\hline
## \hline \\[-1.8ex]
## & \multicolumn{1}{c}{\textit{Dependent variable:}} \\
## \cline{2-2}
## \\[-1.8ex] & totgradrate4yr \\
## \hline \\[-1.8ex]
## after & 0.031$^{**}$ \\
## & (0.012) \\
## & \\
## Constant & 0.251$^{***}$ \\
## & (0.010) \\
## & \\
## \hline \\[-1.8ex]
## Observations & 1,045 \\
## R$^{2}$ & 0.006 \\
## Adjusted R$^{2}$ & 0.005 \\
## Residual Std. Error & 0.183 (df = 1043) \\
## F Statistic & 6.429$^{**}$ (df = 1; 1043) \\
## \hline
## \hline \\[-1.8ex]
## \textit{Note:} & \multicolumn{1}{r}{$^{*}$p$<$0.1; $^{**}$p$<$0.05; $^{***}$p$<$0.01} \\
## \end{tabular}
## \end{table}